#####################################
# LASSO and Multiple Imputation Code
# Requires: dta1, dta2a, dta2b, 
# X-Ntl_Banks data files in same folder 
# as code.
#####################################

library(readstata13)
library(stargazer)
library(MASS)
library(plm)
library(glmmML)
library(plyr)
library(glmmADMB)
library(pglm)
library(survival)
library(ggplot2)
library(ggfortify)
library(sandwich)
library(lmtest)
library(AER)
library(glmnet)
library(mice)
library(VIM)
library(Hmisc)
library(mitools)

setwd("/Users/austinjordan/Documents/School/Harvard/2001/Paper_replication")


##########################
# LASSO for all of Table 1
##########################

# The next 4 subsections perform the LASSO procedure to select 
# variables for all table 1 models.

########################
# PRIO Logit (Table 1.1)

# Start with data 2a, used for table 1.1 and 1.2
dta.2a <- read.dta13("Replication_2a.dta")
# Generate a new variable: GDP Growth
dta.2a$gdpgrow <- dta.2a$gle_gdpgrowth / 100

# Only look at columns with numeric or integer data
holder_2a <- c()
for (i in 1:ncol(dta.2a)){
  if (class(dta.2a[,i]) == 'numeric' | class(dta.2a[,i]) == 'integer'){
    holder_2a <- append(holder_2a,i)
  }
}
lasso_dta2a <- dta.2a[,holder_2a]

# Only use columns where the proportion of NA values is less than 70%
holder_2a_2 <- c()
for (i in 1:ncol(lasso_dta2a)){
  if (sum(is.na(lasso_dta2a[,i]))/length(lasso_dta2a[,i])<.7){
    holder_2a_2 <- append(holder_2a_2, i)
  }
}
lasso_dta2a_shortened <- lasso_dta2a[,holder_2a_2]

# Have to only look at columns with eventdummy data, since that is our DV
lasso_dta2a_shortest <- lasso_dta2a_shortened[!is.na(lasso_dta2a_shortened$eventdummy),]

# Get rows of complete data in restricted dataset
lasso_2a_cov <- as.matrix(lasso_dta2a_shortest[which(rowSums(is.na(lasso_dta2a_shortest))==0),])
# Remove DV from covariate matrix
lasso_2a_cov <- lasso_2a_cov[,-206]
# Get vector of DV
lasso_2a_response <- lasso_dta2a_shortest$eventdummy[which(rowSums(is.na(lasso_dta2a_shortest))==0)]

cv.fit.table.1.1 <- cv.glmnet(lasso_2a_cov, lasso_2a_response, nlambda = 100, 
                              type.measure = "class", family = "binomial")
fit.table.1.1 <- glmnet(x = lasso_2a_cov, y = lasso_2a_response, alpha = 1,
                        lambda = cv.fit.table.1.1$lambda.1se,family = 'binomial')

coef.min.table.1.1 <- coef(cv.fit.table.1.1, s = "lambda.1se")
final.table.1.1 <- coef.min.table.1.1[which(coef.min.table.1.1!=0),]

#########################
# PRIO NegBin (Table 1.2)

# Use `lasso_dta2a_shortened` from 1.1, since it filters out all non-integer and non-numeric
# variables.
# Have to only look at columns with nevent data, since that is our DV
lasso_dta2a_shortest_1.2 <- lasso_dta2a_shortened[!is.na(lasso_dta2a_shortened$nevent),]

# Set covariate matrix for 1.2
lasso_2a_cov <- as.matrix(lasso_dta2a_shortest_1.2[which(rowSums(is.na(lasso_dta2a_shortest_1.2))==0),])

# Set DV vector for 1.2 (using only rows for which we have data)
lasso_2a_2_response <- lasso_dta2a_shortest_1.2$nevent[which(rowSums(is.na(lasso_dta2a_shortest_1.2))==0)]

# Remove DV from covariate matrix
lasso_2a_2_cov <- lasso_2a_cov[,-201]

cv.fit.table.1.2 <- cv.glmnet(lasso_2a_2_cov, lasso_2a_2_response, nlambda = 100
                              , family = 'poisson')
fit.table.1.2 <- glmnet(x = lasso_2a_2_cov, y = lasso_2a_2_response, alpha = 1, 
                        lambda = cv.fit.table.1.2$lambda.1se, family = 'poisson')

coef.min.table.1.2 <- coef(cv.fit.table.1.2, s = 'lambda.1se')
final.table.1.2 <- coef.min.table.1.2[which(coef.min.table.1.2!=0),]

#########################
# Banks Logit (Table 1.3)

# Load the banks data
banks <- read.dta13("X-Ntl_Banks.dta")
banks$gdpgrow <- banks$gle_gdpgrowth / 100
banks$banksdummy <- as.numeric(banks$banksconflict0 == 0)


# Only look at columns with numeric or integer data
holder_banks_1 <- c()
for (i in 1:ncol(banks)){
  if (class(banks[,i]) == 'numeric' | class(banks[,i]) == 'integer'){
    holder_banks_1 <- append(holder_banks_1,i)
  }
}

# Only use columns where the proportion of NA values is less than 10%
lasso_banks <- banks[,holder_banks_1]
holder_banks_2 <- c()
for (i in 1:ncol(lasso_banks)){
  if (sum(is.na(lasso_banks[,i]))/length(lasso_banks[,i])<.1){
    holder_banks_2 <- append(holder_banks_2, i)
  }
}
lasso_banks_shortened <- lasso_banks[,holder_banks_2]

# Have to only look at columns with banksdummy data, since that is our DV
lasso_banks_shortest <- lasso_banks_shortened[!is.na(lasso_banks_shortened$banksdummy),]

# Get rows of complete data in most restricted dataset
lasso_banks_cov <- as.matrix(lasso_banks_shortest[which(rowSums(is.na(lasso_banks_shortest))==0),])
# Remove DV from covariate matrix
dv_col <- which(colnames(lasso_banks_cov) == "banksdummy")
# Remove perfectly colinear variables (similar code to the line above used to find all others)
lasso_banks_cov <- lasso_banks_cov[,-c(193,194,195,196,197,198,199,
                                       200,201,227,228,229,230,231,232,233,234,235,239)]
# Get vector of DV
lasso_banks_response <- lasso_banks_shortest$banksdummy[which(rowSums(is.na(lasso_banks_shortest))==0)]

cv.fit.table.1.3 <- cv.glmnet(lasso_banks_cov, lasso_banks_response, nlambda = 100, 
                              type.measure = "class", family = "binomial")
fit.table.1.3 <- glmnet(x = lasso_banks_cov, y = lasso_banks_response, alpha = 1,
                        lambda = cv.fit.table.1.3$lambda.1se,family = 'binomial')

coef.min.table.1.3 <- coef(cv.fit.table.1.3, s = "lambda.1se")
final.table.1.3 <- coef.min.table.1.3[which(coef.min.table.1.3!=0),]

#########################
# Banks NegBin (Table 1.4)

# Use `lasso_banks_shortened` from 1.3, since it filters out all non-integer and non-numeric
# variables.
# Have to only look at columns with banksconflict0 data, since that is our DV
lasso_banks_shortest_1.4 <- lasso_banks_shortened[!is.na(lasso_banks_shortened$banksconflict0),]

# Get rows of complete data in most restricted dataset
lasso_banks_cov_1.4 <- as.matrix(lasso_banks_shortest_1.4[which(rowSums(is.na(lasso_banks_shortest_1.4))==0),])
# Remove perfectly colinear variables
lasso_banks_cov_1.4 <- lasso_banks_cov_1.4[,-c(193,194,195,196,197,198,199,
                                               200,201,227,228,229,230,231,232,233,234,235,239)]
#Get vector of DV
lasso_banks_response_1.4 <- lasso_banks_shortest_1.4$banksconflict0[which(rowSums(is.na(lasso_banks_shortest_1.4))==0)]

cv.fit.table.1.4 <- cv.glmnet(lasso_banks_cov_1.4, lasso_banks_response_1.4, nlambda = 100, 
                              type.measure = "class", family = "poisson")
fit.table.1.4 <- glmnet(x = lasso_banks_cov_1.4, y = lasso_banks_response_1.4, alpha = 1,
                        lambda = cv.fit.table.1.4$lambda.1se,family = 'poisson')

coef.min.table.1.4 <- coef(cv.fit.table.1.4, s = "lambda.1se")
final.table.1.4 <- coef.min.table.1.4[which(coef.min.table.1.4!=0),]

# Call names(final.table.1.X) to see LASSO selected variables for all table 1 models.

################################################
# Re-run Table 1 regressions with both original 
# covariates and LASSO-selected covariates
# (I filter the LASSO-selected variables to only
# use ones that are reasonable (for example, using
# "year" does not make sense in the context of these
# regressions) or to exclude variables that 
# measure the same thing. This is a researcher 
# choice).
################################################

# Logit Model with PRIO Data Table 1.1
# LASSO identified variables are: openness to trade and over-urbanization
mod.11 <- glm(eventdummy~urbcon+gdpgrow+pwt_openc + overurb+factor(year)+factor(ccode),
              family=binomial(link='logit'),
              data=dta.2a[dta.2a$gle_pop > 1000, ])
summary(mod.11)

# Negative Binomial Model with PRIO Data Table 1.2
# LASSO identified variabels are: exchange rate, number of armed conflicts (internal and external),
# exchange rate, and presence of migration restrictions
mod.12 <- glm.nb(nevent~urbcon+gdpgrow +pwt_er + ucdp_count + unna_er+migrestrict1 +
                  factor(ccode)+factor(year),
                 data = dta.2a[dta.2a$gle_pop > 1000, ])
summary(mod.12)

# Logit Model with Banks Data Table 1.3
# LASSO identified variables are: urban population (logged), over-urbanization (metric #3),
# experience of interstate war in a given year, econmic crisis in a given year, number of
# party coalitions, level of party fractionalization, number of major constitutional changes
# in a given year, and number of major cabinet changes in a given year

mod.13 <- glm(banksdummy ~ urbcon+gdpgrow+lgle_rgdp + lurbpop + overurb2 + interstatewar + 
                econcrisis + s19f5 + s20f5 + s21f2 + s22f2 + factor(year)+factor(ccode),
              family=binomial(link='logit'), data=banks[banks$gle_pop > 1000, ])
summary(mod.13)

# NegBin Model with Banks Data Table 1.4
# LASSO identified variables are: polity socre, number of armed conflicted (external and internal),
# urban population (lagged), total population (lagged), binary indicator of external or internal conflict,
# an indicator for sub-Saharan African countries, an indicator for Western Europe/North America, number
# of major constitutional changes, degree of parliamentary responsiblity, and number of changes in effective
# executive.

mod.14 <- glm.nb(banksconflict0~urbcon+gdpgrow+ p_polity2 + ucdp_count + lurbpop + lgle_pop + conflict + 
                   SubAf + WEuNAm + s21f2 + s21f7 + s22f3 + factor(year)+factor(ccode),
                 data=banks[banks$gle_pop > 1000,])
summary(mod.14)

robust.se <- list(sqrt(diag(vcov(mod.11, type="HC1"))),
                  sqrt(diag(vcov(mod.12, type="HC1"))),
                  sqrt(diag(vcov(mod.13, type="HC1"))),
                  sqrt(diag(vcov(mod.14, type="HC1"))))


# Stargazer was not producing the correct table - estimates were right, but in the wrong place.
# We use this as a template and manually fix the table.
stargazer(mod.11, mod.12, mod.13, mod.14, se = robust.se,
          keep = c("urbcon", "gdpgrow", "lgle_rgdp", "pwt_openc","overurb",
                   "pwt_er", "ucdp_count", "unna_er", "migrestrict1",  "lurbpop", "overurb2", "interstatewar",  
                   "econcrisis", "s19f5", "s20f5", "s21f2" ,"s22f2" ,  "p_polity2", "lgle_pop" , "conflict", 
                   "SubAf", "WEuNAm",  "s21f7" ,"s22f3","Constant"),
          intercept.bottom = TRUE,
          title = "\\textbf{Urban Concentration Linked to Collective Action}",
          dep.var.caption = "PRIO Data \\quad\\quad\\quad
          \\quad\\quad\\quad\\quad Banks Data",
          dep.var.labels = c("\\textbf{Model 1.1}", "\\textbf{Model 1.2}",
                             "\\textbf{Model 1.3}", "\\textbf{Model 1.4}"),
          column.labels = c("Logit", "Negative Binomial",
                            "Logit", "Negative Binomial"),
          model.numbers = FALSE,
          model.names = FALSE,
          no.space = TRUE,
          star.char = c("*", "**", "***", ""),
          covariate.labels = c("Urban Concentration", "GDP Growth",
                               "Real GDP per Capita (logged)", "Openness to Trade", 
                               "Overurbanization", "Exchange Rate", "Conflict Count (External and Internal)", "Exchange Rate (measure 2)", "Migration Restriction",
                                "Urban Population (logged)", "Overurbanization (measure 2)", "Interstate War", "Economic Crisis",
                              "Number of Party Coalitions", "Party Fractionalization", "Number Major Constitutional Changes",
                              "Number of Major Cabinet Changes", "Polity 2 Score",
                              "Total Population (logged)", "Experience of Conflict (Binary, External or Internal)", "Sub-Saharan African", 
                              "Western European/North American", "Parliamentary Responsibility", "Number Changes in Effective Executive",
                              "Constant"),
          omit.stat = c("rsq", "aic", "ll", "theta"),
          notes = "\\parbox[t]{\\textwidth}{\\textit{Note:} Robust standard errors in parentheses.
          *** p < 0.01, ** p < 0.05, * p < 0.1.}",
          notes.align = "l",
          notes.append = FALSE,
          notes.label = "",
          column.sep.width = "-5pt",
          font.size = "footnotesize")


#####################
# Multiple Imputation
#####################

# Create MI data sets
dta.2b <- read.dta13("Replication_2b.dta")
dta.1 <- read.dta13("Replication_1.dta")

dta.1$surv.obj <- with(dta.1, Surv(time = `_t0`, time2 = `_t`, event = regimefail))
dta.2b$surv.obj <- with(dta.2b, Surv(time = `_t0`, time2 =  `_t`, event = regimefail))


# Reduce all data matrices to covariates of interest
dta_1_interest <- cbind("urbcon" = dta.1$urbcon, "urbconlvl" = dta.1$urbconlvl, "_t" = dta.1$`_t`, "_t0" = dta.1$`_t0`, 
                        "regimefail" = dta.1$regimefail, "fivemillion" = dta.1$fivemillion, "gle_pop" = dta.1$gle_pop,
                        "wr_mir" = dta.1$wr_mir,  "wr_mor" = dta.1$wr_mor, "wr_spr" = dta.1$wr_spr,  "leg" = dta.1$leg, 
                        "r_atlas" = dta.1$r_atlas,  "econcrisispwt" = dta.1$econcrisispwt,  "coldwar" = dta.1$coldwar,  
                        "prevtimesinoffice" = dta.1$prevtimesinoffice, "lgle_rgdp" = dta.1$lgle_rgdp, "gdpgrow" = dta.1$gdpgrow,  
                        "conflict" = dta.1$conflict,  "lgle_pop" = dta.1$lgle_pop,  "lcapitalpop" = dta.1$lcapitalpop,  
                        "avgpop" = dta.1$avgpop,  "chga_roy" = dta.1$chga_roy, "chga_mil" = dta.1$chga_mil, 
                        "pwt_openc" = dta.1$pwt_openc,"overurb" = dta.1$overurb,
                        "pwt_er" = dta.1$pwt_er, "ucdp_count" = dta.1$ucdp_count, "unna_er" = dta.1$unna_er, 
                        "migrestrict1" = dta.1$migrestrict1,  "lurbpop" = dta.1$lurbpop, "overurb2" = dta.1$overurb2, 
                        "interstatewar" = dta.1$interstatewar,  
                        "econcrisis" = dta.1$econcrisis, "p_polity2" = dta.1$p_polity2, 
                        "SubAf" = dta.1$SubAf, "WEuNAm" = dta.1$WEuNAm, "ccode" = dta.1$ccode, "year" = dta.1$year,
                        "surv.obj" = dta.1$surv.obj)

dta_2a_interest <- cbind("urbcon" = dta.2a$urbcon, "urbconlvl" = dta.2a$urbconlvl, "_t" = dta.2a$`_t`, "_t0" = dta.2a$`_t0`, 
                         "regimefail" = dta.2a$regimefail, "fivemillion" = dta.2a$fivemillion, "gle_pop" = dta.2a$gle_pop,
                         "wr_mir" = dta.2a$wr_mir,  "wr_mor" = dta.2a$wr_mor, "wr_spr" = dta.2a$wr_spr,  "leg" = dta.2a$leg, 
                         "r_atlas" = dta.2a$r_atlas,  "econcrisispwt" = dta.2a$econcrisispwt,  "coldwar" = dta.2a$coldwar,  
                         "prevtimesinoffice" = dta.2a$prevtimesinoffice, "lgle_rgdp" = dta.2a$lgle_rgdp, "gdpgrow" = dta.2a$gdpgrow,  
                         "conflict" = dta.2a$conflict,  "lgle_pop" = dta.2a$lgle_pop,  "lcapitalpop" = dta.2a$lcapitalpop,  
                         "avgpop" = dta.2a$avgpop,  "chga_roy" = dta.2a$chga_roy, "chga_mil" = dta.2a$chga_mil,
                         "pwt_openc" = dta.2a$pwt_openc,"overurb" = dta.2a$overurb,
                         "pwt_er" = dta.2a$pwt_er, "ucdp_count" = dta.2a$ucdp_count, "unna_er" = dta.2a$unna_er, 
                         "migrestrict1" = dta.2a$migrestrict1,  "lurbpop" = dta.2a$lurbpop, "overurb2" = dta.2a$overurb2, 
                         "interstatewar" = dta.2a$interstatewar,  
                         "econcrisis" = dta.2a$econcrisis,  "p_polity2" = dta.2a$p_polity2, 
                         "SubAf" = dta.2a$SubAf, "WEuNAm" = dta.2a$WEuNAm, "nevent" = dta.2a$nevent, "eventdummy" = dta.2a$eventdummy,
                         "ccode" = dta.2a$ccode, "year" = dta.2a$year)

dta_2b_interest <- cbind("urbcon" = dta.2b$urbcon, "urbconlvl" = dta.2b$urbconlvl, "_t" = dta.2b$`_t`, "_t0" = dta.2b$`_t0`, 
                         "regimefail" = dta.2b$regimefail, "fivemillion" = dta.2b$fivemillion, "gle_pop" = dta.2b$gle_pop,
                         "wr_mir" = dta.2b$wr_mir,  "wr_mor" = dta.2b$wr_mor, "wr_spr" = dta.2b$wr_spr,  "leg" = dta.2b$leg, 
                         "r_atlas" = dta.2b$r_atlas,  "econcrisispwt" = dta.2b$econcrisispwt,  "coldwar" = dta.2b$coldwar,  
                         "prevtimesinoffice" = dta.2b$prevtimesinoffice, "lgle_rgdp" = dta.2b$lgle_rgdp, "gdpgrow" = dta.2b$gdpgrow,  
                         "conflict" = dta.2b$conflict,  "lgle_pop" = dta.2b$lgle_pop,  "lcapitalpop" = dta.2b$lcapitalpop,  
                         "avgpop" = dta.2b$avgpop,  "chga_roy" = dta.2b$chga_roy, "chga_mil" = dta.2b$chga_mil,
                         "pwt_openc" = dta.2b$pwt_openc,"overurb" = dta.2b$overurb,
                         "pwt_er" = dta.2b$pwt_er, "ucdp_count" = dta.2b$ucdp_count, "unna_er" = dta.2b$unna_er, 
                         "migrestrict1" = dta.2b$migrestric1,  "lurbpop" = dta.2b$lurbpop, "overurb2" = dta.2b$overurb2, 
                         "interstatewar" = dta.2b$interstatewar,  
                         "econcrisis" = dta.2b$econcrisis,  "p_polity2" = dta.2b$p_polity2, 
                         "SubAf" = dta.2b$SubAf, "WEuNAm" = dta.2b$WEuNAm, "nevent" = dta.2b$nevent, "eventdummy" = dta.2b$eventdummy,
                         "ubrra" = dta.2b$ubrra, "ccode" = dta.2b$ccode, "year" = dta.2b$year, "surv.obj" = dta.2b$surv.obj)

banks_interest <- cbind("urbcon" = banks$urbcon, "urbconlvl" = banks$urbconlvl, "_t" = banks$`_t`, "_t0" = banks$`_t0`, 
                         "regimefail" = banks$regimefail, "fivemillion" = banks$fivemillion, "gle_pop" = banks$gle_pop,
                         "wr_mir" = banks$wr_mir,  "wr_mor" = banks$wr_mor, "wr_spr" = banks$wr_spr,  "leg" = banks$leg, 
                         "r_atlas" = banks$r_atlas,  "econcrisispwt" = banks$econcrisispwt,  "coldwar" = banks$coldwar,  
                        "prevtimesinoffice" = banks$prevtimesinoffice, "lgle_rgdp" = banks$lgle_rgdp, "gdpgrow" = banks$gdpgrow,  
                        "conflict" = banks$conflict,  "lgle_pop" = banks$lgle_pop,  "lcapitalpop" = banks$lcapitalpop,  
                        "chga_roy" = banks$chga_roy, "chga_mil" = banks$chga_mil, 
                        "pwt_openc" = banks$pwt_openc,"overurb" = banks$overurb,
                        "pwt_er" = banks$pwt_er, "ucdp_count" = banks$ucdp_count, "unna_er" = banks$unna_er,
                        "migrestrict1" = banks$migrestrict1,  "lurbpop" = banks$lurbpop, "overurb2" = banks$overurb2, 
                        "interstatewar" =banks$interstatewar, "econcrisis" = banks$econcrisis, "s19f5" = banks$s19f5, 
                        "s20f5" = banks$s20f5, "s21f2" = banks$s21f2 ,"s22f2" = banks$s22f2 ,  "p_polity2"=banks$p_polity2,
                        "SubAf"= banks$SubAf, "WEuNAm" = banks$WEuNAm,
                        "s21f7" = banks$s21f7 ,"s22f3" = banks$s22f3, "banksconflict0" = banks$banksconflict0, 
                        "banksdummy" = banks$banksdummy, "ccode" = banks$ccode, "year" = banks$year)

# Create MI Data sets
mi_dta1 <- mice(dta_1_interest)
mi_dta2a <- mice(dta_2a_interest)
mi_dta2b <- mice(dta_2b_interest)
mi_banks <- mice(banks_interest)

dta1_pairs <-md.pairs(dta_1_interest)
dta2a_pairs <-md.pairs(dta_2a_interest)
dta2b_pairs <- md.pairs(dta_2b_interest)
banks_pairs <- md.pairs(banks_interest)


######################################
# Table 1 with MI and LASSO Covariates
######################################

# Logit Model 1.1
fit.11.imp <- with(mi_dta2a, glm(eventdummy~urbcon+gdpgrow+pwt_openc + overurb+factor(year)+factor(ccode),
              family=binomial(link='logit')))
summary(pool(fit.11.imp))


# Negative Binomial Model 1.2
fit.12.imp <- with(mi_dta2a, glm.nb(nevent~urbcon+gdpgrow +pwt_er + ucdp_count + unna_er+migrestrict1 +
                                      factor(ccode)+factor(year)))
summary(pool(fit.12.imp))

fit.13.imp <- with(mi_banks, glm(banksdummy ~ urbcon+gdpgrow+lgle_rgdp + lurbpop + overurb2 + interstatewar + 
                                   econcrisis + s19f5 + s20f5 + s21f2 + s22f2 + factor(year)+factor(ccode)))
summary(pool(fit.13.imp))

fit.14.imp <- with(mi_banks, glm.nb(banksconflict0~urbcon+gdpgrow+ p_polity2 + ucdp_count + lurbpop + lgle_pop + conflict 
                                     + s21f2 + s21f7 + s22f3 + factor(year)+factor(ccode)))
summary(pool(fit.14.imp))


# Code from https://datascienceplus.com/imputing-missing-data-with-r-mice-package/
pdf("missingness_dta1.pdf")
aggr(dta_1_interest, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(dta_1_interest), cex.axis=.5, gap=3, ylab=c("Histogram of Missing Data - dta1","Pattern"))
dev.off()

pdf("missingness_dta2a.pdf")
aggr(dta_2a_interest, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(dta_2a_interest), cex.axis=.5, gap=3, ylab=c("Histogram of Missing Data - dta2a","Pattern"))
dev.off()

pdf("missingness_dta2b.pdf")
aggr(dta_2b_interest, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(dta_2b_interest), cex.axis=.5, gap=3, ylab=c("Histogram of missing data - dta2b","Pattern"))
dev.off()

pdf("missingness_banks.pdf")
aggr(banks_interest, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(banks_interest), cex.axis=.5, gap=3, ylab=c("Histogram of missing data - Banks","Pattern"))
dev.off()







